Version: 0.11.4

AutoML

FindBestModel

from synapse.ml.automl import *
from synapse.ml.train import *
from pyspark.ml.classification import RandomForestClassifier

df = (spark.createDataFrame([
(0, 2, 0.50, 0.60, 0),
(1, 3, 0.40, 0.50, 1),
(0, 4, 0.78, 0.99, 2),
(1, 5, 0.12, 0.34, 3),
(0, 1, 0.50, 0.60, 0),
(1, 3, 0.40, 0.50, 1),
(0, 3, 0.78, 0.99, 2),
(1, 4, 0.12, 0.34, 3),
(0, 0, 0.50, 0.60, 0),
(1, 2, 0.40, 0.50, 1),
(0, 3, 0.78, 0.99, 2),
(1, 4, 0.12, 0.34, 3)
], ["Label", "col1", "col2", "col3", "col4"]))

# Train a candidate model (the same fitted model is passed twice to FindBestModel below)
randomForestClassifier = (TrainClassifier()
    .setModel(RandomForestClassifier()
        .setMaxBins(32)
        .setMaxDepth(5)
        .setMinInfoGain(0.0)
        .setMinInstancesPerNode(1)
        .setNumTrees(20)
        .setSubsamplingRate(1.0)
        .setSeed(0))
    .setFeaturesCol("mlfeatures")
    .setLabelCol("Label"))
model = randomForestClassifier.fit(df)

findBestModel = (FindBestModel()
    .setModels([model, model])
    .setEvaluationMetric("accuracy"))
bestModel = findBestModel.fit(df)
bestModel.transform(df).show()
Python API: FindBestModel | Scala API: FindBestModel | Source: FindBestModel

TuneHyperparameters

from synapse.ml.automl import *
from synapse.ml.train import *
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier


df = (spark.createDataFrame([
(0, 1, 1, 1, 1, 1, 1.0, 3, 1, 1),
(0, 1, 1, 1, 1, 2, 1.0, 1, 1, 1),
(0, 1, 1, 1, 1, 2, 1.0, 2, 1, 1),
(0, 1, 2, 3, 1, 2, 1.0, 3, 1, 1),
(0, 3, 1, 1, 1, 2, 1.0, 3, 1, 1)
], ["Label", "Clump_Thickness", "Uniformity_of_Cell_Size",
"Uniformity_of_Cell_Shape", "Marginal_Adhesion", "Single_Epithelial_Cell_Size",
"Bare_Nuclei", "Bland_Chromatin", "Normal_Nucleoli", "Mitoses"]))

logReg = LogisticRegression()
randForest = RandomForestClassifier()
gbt = GBTClassifier()
smlmodels = [logReg, randForest, gbt]
mmlmodels = [TrainClassifier(model=model, labelCol="Label") for model in smlmodels]

paramBuilder = (HyperparamBuilder()
    .addHyperparam(logReg, logReg.regParam, RangeHyperParam(0.1, 0.3))
    .addHyperparam(randForest, randForest.numTrees, DiscreteHyperParam([5, 10]))
    .addHyperparam(randForest, randForest.maxDepth, DiscreteHyperParam([3, 5]))
    .addHyperparam(gbt, gbt.maxBins, RangeHyperParam(8, 16))
    .addHyperparam(gbt, gbt.maxDepth, DiscreteHyperParam([3, 5])))
searchSpace = paramBuilder.build()
# The search space is a list of (param, (estimator, hyperparam)) pairs
randomSpace = RandomSpace(searchSpace)

bestModel = TuneHyperparameters(
    evaluationMetric="accuracy", models=mmlmodels, numFolds=2,
    numRuns=len(mmlmodels) * 2, parallelism=2,
    paramSpace=randomSpace.space(), seed=0).fit(df)
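
To see which candidate won and to score data with it, the fitted model can be inspected and applied. A minimal sketch, assuming the getBestModelInfo accessor used in SynapseML's hyperparameter-tuning examples:

# Print the winning model's hyperparameters, then score the training data.
print(bestModel.getBestModelInfo())
bestModel.transform(df).show()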
Python API: TuneHyperparameters | Scala API: TuneHyperparameters | Source: TuneHyperparameters

Featurize

CleanMissingData

from synapse.ml.featurize import *

dataset = spark.createDataFrame([
(0, 2, 0.50, 0.60, 0),
(1, 3, 0.40, None, None),
(0, 4, 0.78, 0.99, 2),
(1, 5, 0.12, 0.34, 3),
(0, 1, 0.50, 0.60, 0),
(None, None, None, None, None),
(0, 3, 0.78, 0.99, 2),
(1, 4, 0.12, 0.34, 3),
(0, None, 0.50, 0.60, 0),
(1, 2, 0.40, 0.50, None),
(0, 3, None, 0.99, 2),
(1, 4, 0.12, 0.34, 3)
], ["col1", "col2", "col3", "col4", "col5"])

cmd = (CleanMissingData()
    .setInputCols(dataset.columns)
    .setOutputCols(dataset.columns)
    .setCleaningMode("Mean"))
Python API: CleanMissingData | Scala API: CleanMissingData | Source: CleanMissingData

CountSelector

from synapse.ml.featurize import *
from pyspark.ml.linalg import Vectors

df = spark.createDataFrame([
(Vectors.sparse(3, [(0, 1.0), (2, 2.0)]), Vectors.dense(1.0, 0.1, 0)),
(Vectors.sparse(3, [(0, 1.0), (2, 2.0)]), Vectors.dense(1.0, 0.1, 0))
], ["col1", "col2"])

cs = CountSelector().setInputCol("col1").setOutputCol("col3")

cs.fit(df).transform(df).show()
Python API: CountSelector | Scala API: CountSelector | Source: CountSelector

Featurize

from synapse.ml.featurize import *

dataset = spark.createDataFrame([
(0, 2, 0.50, 0.60, "pokemon are everywhere"),
(1, 3, 0.40, 0.50, "they are in the woods"),
(0, 4, 0.78, 0.99, "they are in the water"),
(1, 5, 0.12, 0.34, "they are in the fields"),
(0, 3, 0.78, 0.99, "pokemon - gotta catch em all")
], ["Label", "col1", "col2", "col3"])

feat = (Featurize()
    .setNumFeatures(10)
    .setOutputCol("testColumn")
    .setInputCols(["col1", "col2", "col3"])
    .setOneHotEncodeCategoricals(False))

feat.fit(dataset).transform(dataset).show()
Python API: Featurize | Scala API: Featurize | Source: Featurize

ValueIndexer

from synapse.ml.featurize import *

df = spark.createDataFrame([
(-3, 24, 0.32534, True, "piano"),
(1, 5, 5.67, False, "piano"),
(-3, 5, 0.32534, False, "guitar")
], ["int", "long", "double", "bool", "string"])

vi = ValueIndexer().setInputCol("string").setOutputCol("string_cat")

vi.fit(df).transform(df).show()
Python API: ValueIndexer | Scala API: ValueIndexer | Source: ValueIndexer

Featurize Text

TextFeaturizer

from synapse.ml.featurize.text import *

dfRaw = spark.createDataFrame([
(0, "Hi I"),
(1, "I wish for snow today"),
(2, "we Cant go to the park, because of the snow!"),
(3, "")
], ["label", "sentence"])

tfRaw = (TextFeaturizer()
    .setInputCol("sentence")
    .setOutputCol("features")
    .setNumFeatures(20))

tfRaw.fit(dfRaw).transform(dfRaw).show()
Python API: TextFeaturizer | Scala API: TextFeaturizer | Source: TextFeaturizer

Isolation Forest

IsolationForest

from synapse.ml.isolationforest import *

isolationForest = (IsolationForest()
    .setNumEstimators(100)
    .setBootstrap(False)
    .setMaxSamples(256)
    .setMaxFeatures(1.0)
    .setFeaturesCol("features")
    .setPredictionCol("predictedLabel")
    .setScoreCol("outlierScore")
    .setContamination(0.02)
    .setContaminationError(0.02 * 0.01)
    .setRandomSeed(1))
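
The snippet above only configures the estimator. A minimal fit/score sketch on hypothetical toy data (a vector column named "features", sized so it exceeds the maxSamples of 256 set above):

import random
from pyspark.ml.linalg import Vectors

random.seed(0)
# Hypothetical toy data: ~300 inliers near the origin plus two obvious outliers.
rows = [(Vectors.dense(random.gauss(0.0, 1.0), random.gauss(0.0, 1.0)),) for _ in range(300)]
rows += [(Vectors.dense(20.0, 20.0),), (Vectors.dense(-20.0, 18.0),)]
points = spark.createDataFrame(rows, ["features"])

# transform adds the configured outlierScore and predictedLabel columns.
isolationForestModel = isolationForest.fit(points)
isolationForestModel.transform(points).show(5)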
Python API: IsolationForest | Scala API: IsolationForest | Source: IsolationForest

NN

ConditionalKNN

from synapse.ml.nn import *

cknn = (ConditionalKNN()
    .setOutputCol("matches")
    .setFeaturesCol("features"))
Python API: ConditionalKNN | Scala API: ConditionalKNN | Source: ConditionalKNN

KNN

from synapse.ml.nn import *

knn = (KNN()
    .setOutputCol("matches"))
Python API: KNN | Scala API: KNN | Source: KNN

Recommendation

RecommendationIndexer, RankingEvaluator, RankingAdapter and RankingTrainValidationSplit

from synapse.ml.recommendation import *
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import *

ratings = (spark.createDataFrame([
("11", "Movie 01", 2),
("11", "Movie 03", 1),
("11", "Movie 04", 5),
("11", "Movie 05", 3),
("11", "Movie 06", 4),
("11", "Movie 07", 1),
("11", "Movie 08", 5),
("11", "Movie 09", 3),
("22", "Movie 01", 4),
("22", "Movie 02", 5),
("22", "Movie 03", 1),
("22", "Movie 05", 3),
("22", "Movie 06", 3),
("22", "Movie 07", 5),
("22", "Movie 08", 1),
("22", "Movie 10", 3),
("33", "Movie 01", 4),
("33", "Movie 03", 1),
("33", "Movie 04", 5),
("33", "Movie 05", 3),
("33", "Movie 06", 4),
("33", "Movie 08", 1),
("33", "Movie 09", 5),
("33", "Movie 10", 3),
("44", "Movie 01", 4),
("44", "Movie 02", 5),
("44", "Movie 03", 1),
("44", "Movie 05", 3),
("44", "Movie 06", 4),
("44", "Movie 07", 5),
("44", "Movie 08", 1),
("44", "Movie 10", 3)
], ["customerIDOrg", "itemIDOrg", "rating"])
.dropDuplicates()
.cache())

recommendationIndexer = (RecommendationIndexer()
    .setUserInputCol("customerIDOrg")
    .setUserOutputCol("customerID")
    .setItemInputCol("itemIDOrg")
    .setItemOutputCol("itemID")
    .setRatingCol("rating"))

transformedDf = (recommendationIndexer.fit(ratings)
.transform(ratings).cache())

als = (ALS()
    .setNumUserBlocks(1)
    .setNumItemBlocks(1)
    .setUserCol("customerID")
    .setItemCol("itemID")
    .setRatingCol("rating")
    .setSeed(0))

evaluator = (RankingEvaluator()
    .setK(3)
    .setNItems(10))

adapter = (RankingAdapter()
    .setK(evaluator.getK())
    .setRecommender(als))

adapter.fit(transformedDf).transform(transformedDf).show()

paramGrid = (ParamGridBuilder()
    .addGrid(als.regParam, [1.0])
    .build())

tvRecommendationSplit = (RankingTrainValidationSplit()
    .setEstimator(als)
    .setEvaluator(evaluator)
    .setEstimatorParamMaps(paramGrid)
    .setTrainRatio(0.8)
    .setUserCol(recommendationIndexer.getUserOutputCol())
    .setItemCol(recommendationIndexer.getItemOutputCol())
    .setRatingCol("rating"))

tvRecommendationSplit.fit(transformedDf).transform(transformedDf).show()
Python API: RecommendationIndexer | Scala API: RecommendationIndexer | Source: RecommendationIndexer
Python API: RankingEvaluator | Scala API: RankingEvaluator | Source: RankingEvaluator
Python API: RankingAdapter | Scala API: RankingAdapter | Source: RankingAdapter
Python API: RankingTrainValidationSplit | Scala API: RankingTrainValidationSplit | Source: RankingTrainValidationSplit

SAR

from synapse.ml.recommendation import *

ratings = (spark.createDataFrame([
("11", "Movie 01", 2),
("11", "Movie 03", 1),
("11", "Movie 04", 5),
("11", "Movie 05", 3),
("11", "Movie 06", 4),
("11", "Movie 07", 1),
("11", "Movie 08", 5),
("11", "Movie 09", 3),
("22", "Movie 01", 4),
("22", "Movie 02", 5),
("22", "Movie 03", 1),
("22", "Movie 05", 3),
("22", "Movie 06", 3),
("22", "Movie 07", 5),
("22", "Movie 08", 1),
("22", "Movie 10", 3),
("33", "Movie 01", 4),
("33", "Movie 03", 1),
("33", "Movie 04", 5),
("33", "Movie 05", 3),
("33", "Movie 06", 4),
("33", "Movie 08", 1),
("33", "Movie 09", 5),
("33", "Movie 10", 3),
("44", "Movie 01", 4),
("44", "Movie 02", 5),
("44", "Movie 03", 1),
("44", "Movie 05", 3),
("44", "Movie 06", 4),
("44", "Movie 07", 5),
("44", "Movie 08", 1),
("44", "Movie 10", 3)
], ["customerIDOrg", "itemIDOrg", "rating"])
.dropDuplicates()
.cache())

recommendationIndexer = (RecommendationIndexer()
    .setUserInputCol("customerIDOrg")
    .setUserOutputCol("customerID")
    .setItemInputCol("itemIDOrg")
    .setItemOutputCol("itemID")
    .setRatingCol("rating"))

algo = (SAR()
    .setUserCol("customerID")
    .setItemCol("itemID")
    .setRatingCol("rating")
    .setTimeCol("timestamp")
    .setSupportThreshold(1)
    .setSimilarityFunction("jaccard")
    .setActivityTimeFormat("EEE MMM dd HH:mm:ss Z yyyy"))

adapter = (RankingAdapter()
    .setK(5)
    .setRecommender(algo))

res1 = recommendationIndexer.fit(ratings).transform(ratings).cache()

adapter.fit(res1).transform(res1).show()
Python API: SAR | Scala API: SAR | Source: SAR

Stages

ClassBalancer

from synapse.ml.stages import *

df = (spark.createDataFrame([
(0, 1.0, "Hi I"),
(1, 1.0, "I wish for snow today"),
(2, 2.0, "I wish for snow today"),
(3, 2.0, "I wish for snow today"),
(4, 2.0, "I wish for snow today"),
(5, 2.0, "I wish for snow today"),
(6, 0.0, "I wish for snow today"),
(7, 1.0, "I wish for snow today"),
(8, 0.0, "we Cant go to the park, because of the snow!"),
(9, 2.0, "")
], ["index", "label", "sentence"]))

cb = ClassBalancer().setInputCol("label")

cb.fit(df).transform(df).show()
Python API: ClassBalancer | Scala API: ClassBalancer | Source: ClassBalancer

MultiColumnAdapter

from synapse.ml.stages import *
from pyspark.ml.feature import Tokenizer

df = (spark.createDataFrame([
(0, "This is a test", "this is one too"),
(1, "could be a test", "bar"),
(2, "foo", "bar"),
(3, "foo", "maybe not")
], ["label", "words1", "words2"]))

stage1 = Tokenizer()
mca = (MultiColumnAdapter()
    .setBaseStage(stage1)
    .setInputCols(["words1", "words2"])
    .setOutputCols(["output1", "output2"]))

mca.fit(df).transform(df).show()
Python API: MultiColumnAdapter | Scala API: MultiColumnAdapter | Source: MultiColumnAdapter

Timer

from synapse.ml.stages import *
from pyspark.ml.feature import *

df = (spark.createDataFrame([
(0, "Hi I"),
(1, "I wish for snow today"),
(2, "we Cant go to the park, because of the snow!"),
(3, "")
], ["label", "sentence"]))

tok = (Tokenizer()
    .setInputCol("sentence")
    .setOutputCol("tokens"))

df2 = Timer().setStage(tok).fit(df).transform(df)

df3 = HashingTF().setInputCol("tokens").setOutputCol("hash").transform(df2)

idf = IDF().setInputCol("hash").setOutputCol("idf")
timer = Timer().setStage(idf)

timer.fit(df3).transform(df3).show()
Python API: Timer | Scala API: Timer | Source: Timer

Train

TrainClassifier

from synapse.ml.train import *
from pyspark.ml.classification import LogisticRegression

df = spark.createDataFrame([
(0, 2, 0.50, 0.60, 0),
(1, 3, 0.40, 0.50, 1),
(0, 4, 0.78, 0.99, 2),
(1, 5, 0.12, 0.34, 3),
(0, 1, 0.50, 0.60, 0),
(1, 3, 0.40, 0.50, 1),
(0, 3, 0.78, 0.99, 2),
(1, 4, 0.12, 0.34, 3),
(0, 0, 0.50, 0.60, 0),
(1, 2, 0.40, 0.50, 1),
(0, 3, 0.78, 0.99, 2),
(1, 4, 0.12, 0.34, 3)],
["Label", "col1", "col2", "col3", "col4"]
)

tc = (TrainClassifier()
    .setModel(LogisticRegression())
    .setLabelCol("Label"))

tc.fit(df).transform(df).show()
Python API: TrainClassifier | Scala API: TrainClassifier | Source: TrainClassifier

TrainRegressor

from synapse.ml.train import *
from pyspark.ml.regression import LinearRegression

dataset = (spark.createDataFrame([
(0.0, 2, 0.50, 0.60, 0.0),
(1.0, 3, 0.40, 0.50, 1.0),
(2.0, 4, 0.78, 0.99, 2.0),
(3.0, 5, 0.12, 0.34, 3.0),
(0.0, 1, 0.50, 0.60, 0.0),
(1.0, 3, 0.40, 0.50, 1.0),
(2.0, 3, 0.78, 0.99, 2.0),
(3.0, 4, 0.12, 0.34, 3.0),
(0.0, 0, 0.50, 0.60, 0.0),
(1.0, 2, 0.40, 0.50, 1.0),
(2.0, 3, 0.78, 0.99, 2.0),
(3.0, 4, 0.12, 0.34, 3.0)],
["label", "col1", "col2", "col3", "col4"]))

linearRegressor = (LinearRegression()
    .setRegParam(0.3)
    .setElasticNetParam(0.8))
trainRegressor = (TrainRegressor()
    .setModel(linearRegressor)
    .setLabelCol("label"))

trainRegressor.fit(dataset).transform(dataset).show()
Python API: TrainRegressor | Scala API: TrainRegressor | Source: TrainRegressor